#coding: utf-8
import nltk
import os
import codecs
import random

# Output files and the NLTK corpora that feed each of them.  The two lists are
# parallel: filenames[i] is built from the corpora named in corpus_lists[i].
filenames = [
    'data/mr/all.txt',
    'data/abc/all.txt',
    'data/brown/all.txt',
]
corpus_lists = [
    ['movie_reviews'],
    ['abc'],
    ['brown'],
]

# Sentences with fewer tokens than this are dropped from the output.
MIN_SENT_TOKENS = 4

if len(filenames) != len(corpus_lists):
    raise ValueError(
        'filenames and corpus_lists must have the same length '
        '(got %d and %d)' % (len(filenames), len(corpus_lists))
    )

# Pair each output file with its own corpus group.  Nesting the two loops
# would rebuild every file once per group, leaving each file with only the
# sentences of the last group.
for filename, corpus_list in zip(filenames, corpus_lists):
    out_dir = os.path.dirname(filename)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # One line per sentence: tokens joined by single spaces.
    lines = []
    for corpus_name in corpus_list:
        corpus = getattr(nltk.corpus, corpus_name)
        lines.extend(
            ' '.join(sent)
            for sent in corpus.sents()
            if len(sent) >= MIN_SENT_TOKENS
        )

    # Mix sentences so the file order does not follow the corpus order.
    random.shuffle(lines)

    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')
